#!/usr/bin/python3

"""
This script reads the modification time of each unpacked app in the
malicious_apk and benign_apk folders (taken from classes.dex, or from
resources.arsc when classes.dex is absent) and copies those dates into a
JSON file for later analysis.

The output data format is as follows:
{"features": ["1222000000_to_1222111111", ...],
 "apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1], "malicious": [0,1]}, ...}}
"""

import os
import json
import glob
import time
import random
from configparser import ConfigParser

__author__='mkkeffeler'

# Android was released Sept. 23, 2008; timestamps before this are implausible.
_ANDROID_EPOCH = 1222000000
# Width of one date bucket in seconds (a 28-day "month").
_BUCKET_SECONDS = 60 * 60 * 24 * 28
# (directory, one-hot label) pairs: 1st bit set if malicious, otherwise 2nd bit.
_APP_SOURCES = (('benign_apk', [0, 1]), ('malicious_apk', [1, 0]))


def _unpacked_mtime(app_dir):
    """Return st_mtime of classes.dex (or else resources.arsc) in app_dir, or None if neither exists."""
    for member in ('classes.dex', 'resources.arsc'):
        try:
            return os.stat(os.path.join(app_dir, member)).st_mtime
        except (FileNotFoundError, NotADirectoryError):
            # Missing file, missing directory, or app_dir is not a directory.
            continue
    return None


def _bucket_names(now):
    """Return contiguous 'start_to_end' bucket names from the Android epoch until one reaches `now`."""
    names = []
    start = _ANDROID_EPOCH
    while True:
        end = start + _BUCKET_SECONDS
        names.append(str(start) + "_to_" + str(end))
        # Apps can't have been made in the future, so stop once `now` is covered.
        if end >= now:
            return names
        start = end


def _bucket_for(mtime, date_buckets):
    """Return the bucket name whose half-open range [start, end) contains mtime, or None."""
    # Buckets are contiguous and fixed-width, so the index is pure arithmetic.
    index = (mtime - _ANDROID_EPOCH) // _BUCKET_SECONDS
    if 0 <= index < len(date_buckets):
        return date_buckets[index]
    return None


def main():
    """Build a balanced, one-hot date-bucket dataset from unpacked APKs.

    Reads NUM_DATE_BUCKETS from the [AMA] section of config.ini in the
    current directory, scans benign_apk/ and malicious_apk/ for '*.apk'
    files, and dates each app by the mtime of classes.dex (falling back to
    resources.arsc) inside the sibling directory named after the APK.

    Files written to the current directory:
      * file_not_found_error  - one APK name per line for apps that had
        neither file.
      * apps_per_bucket.json  - {bucket: [malicious_count, benign_count]}.
      * app_date_vectors.json - {'features': [...], 'apps': {...}} where each
        app has a one-hot 'vector' over the features and a one-hot
        'malicious' label.

    Raises:
        FileNotFoundError: if benign_apk or malicious_apk is not a directory.
        configparser.Error: if the config section or option is missing.
    """
    config = ConfigParser()
    config.read('config.ini')
    num_date_buckets = config.getint('AMA', 'NUM_DATE_BUCKETS')

    # Read the clock once so the "future" check and the bucket range agree.
    now = time.time()

    app_date_map = {}  # apk file name -> integer mtime
    app_malicious_map = {}  # apk file name -> one-hot label ([1,0] malicious, [0,1] benign)
    num_apps_before = 0
    num_apps_after = 0
    num_file_not_found = 0

    with open('file_not_found_error', 'w') as fnfe:
        for directory, label in _APP_SOURCES:
            if not os.path.isdir(directory):
                raise FileNotFoundError('app directory not found: ' + directory)
            for apk_path in glob.glob(os.path.join(directory, '*.apk')):
                filename = os.path.basename(apk_path)
                # The unpacked app lives next to the APK, minus the '.apk' suffix.
                mtime = _unpacked_mtime(apk_path[:-4])
                if mtime is None:
                    num_file_not_found += 1
                    fnfe.write(filename + '\n')
                    continue
                if mtime < _ANDROID_EPOCH:
                    num_apps_before += 1
                if mtime > now:
                    num_apps_after += 1
                app_date_map[filename] = int(mtime)
                # Give each app its own list so no later counter can alias it.
                app_malicious_map[filename] = list(label)

    date_buckets = _bucket_names(now)

    # Count the number of apps per date range so we can ensure there's an
    # equal number of each class in every bucket we keep.
    app_bucket_map = {}  # apk file name -> bucket name (only apps that fall in a bucket)
    apps_per_bucket = {}  # bucket name -> [malicious_count, benign_count]
    for app_name, mtime in app_date_map.items():
        bucket = _bucket_for(mtime, date_buckets)
        if bucket is None:
            continue
        app_bucket_map[app_name] = bucket
        # A fresh counter per bucket: the app's label list must never be
        # reused here, or incrementing the counts would corrupt the label.
        counts = apps_per_bucket.setdefault(bucket, [0, 0])
        counts[0 if app_malicious_map[app_name][0] == 1 else 1] += 1
    with open('apps_per_bucket.json', 'w') as f:
        json.dump(apps_per_bucket, f)

    # Keep the buckets whose scarcer class is largest (stable for ties).
    relevant_buckets = sorted(
        apps_per_bucket,
        key=lambda bucket: min(apps_per_bucket[bucket]),
        reverse=True,
    )[:max(num_date_buckets, 0)]

    # Number of apps of each type (benign/malicious) taken from each bucket;
    # the last kept bucket has the smallest minimum. No buckets -> no apps.
    if relevant_buckets:
        apps_per_bucket_limit = min(apps_per_bucket[relevant_buckets[-1]])
    else:
        apps_per_bucket_limit = 0

    # Now add up to apps_per_bucket_limit apps of each class from each bucket.
    bucket_position = {bucket: pos for pos, bucket in enumerate(relevant_buckets)}
    apps_found_per_bucket = {bucket: [0, 0] for bucket in relevant_buckets}
    all_apps = {}
    for app_name, bucket in app_bucket_map.items():
        if bucket not in bucket_position:
            continue
        label = app_malicious_map[app_name]
        slot = 0 if label == [1, 0] else 1
        found = apps_found_per_bucket[bucket]
        if found[slot] >= apps_per_bucket_limit:
            continue
        found[slot] += 1
        date_vector = [0] * len(relevant_buckets)
        date_vector[bucket_position[bucket]] = 1
        all_apps[app_name] = {'vector': date_vector, 'malicious': label}

    with open('app_date_vectors.json', 'w') as outfile:
        json.dump({'features': relevant_buckets, 'apps': all_apps}, outfile)
    print('Wrote data on ' + str(len(relevant_buckets)) + ' date buckets and ' + str(len(all_apps)) + ' apps to a file.')
    print('{} apps were before Android began and {} were after today'.format(num_apps_before, num_apps_after))
    print('{} classes.dex files were not found'.format(num_file_not_found))

# Run the analysis only when this file is executed as a script, not on import.
if __name__=='__main__':
    main()
